{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastNLP import DataSet\n",
    "\n",
    "# Column-oriented toy corpus: each key names a field and each list holds one value per instance.\n",
    "data = dict(\n",
    "    raw_words=['This is the first instance .', 'Second instance .', 'Third instance .'],\n",
    "    words=[\n",
    "        ['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "        ['Second', 'instance', '.'],\n",
    "        ['Third', 'instance', '.'],\n",
    "    ],\n",
    "    seq_len=[6, 3, 3],\n",
    ")\n",
    "dataset = DataSet(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------------+-------------------------------+---------+\n",
      "| raw_words                     | words                         | seq_len |\n",
      "+-------------------------------+-------------------------------+---------+\n",
      "| This is the first instance... | ['this', 'is', 'the', 'fir... | 6       |\n",
      "| Second instance .             | ['Second', 'instance', '.'... | 3       |\n",
      "| Third instance .              | ['Third', 'instance', '.']... | 3       |\n",
      "+-------------------------------+-------------------------------+---------+\n"
     ]
    }
   ],
   "source": [
    "# Show the whole dataset as a text table: one row per instance, one column per field.\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------------+-------------------------------+---------+\n",
      "| raw_words                     | words                         | seq_len |\n",
      "+-------------------------------+-------------------------------+---------+\n",
      "| This is the first instance... | ['this', 'is', 'the', 'fir... | 6       |\n",
      "| Second instance .             | ['Second', 'instance', '.'... | 3       |\n",
      "| Third instance .              | ['Third', 'instance', '.']... | 3       |\n",
      "| This is the first instance... | ['this', 'is', 'the', 'fir... | 6       |\n",
      "+-------------------------------+-------------------------------+---------+\n"
     ]
    }
   ],
   "source": [
    "from fastNLP import Instance\n",
    "\n",
    "# One extra row with the same three fields as the dataset. raw_words keeps the trailing ' .'\n",
    "# so that it agrees with the six tokens in `words` and with seq_len=6.\n",
    "instance = Instance(raw_words='This is the first instance .',\n",
    "                    words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "                    seq_len=6)\n",
    "# append() adds the instance as a new last row (see the four-row table printed by this cell).\n",
    "dataset.append(instance)\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------------+-------------------------------+---------+\n",
      "| raw_words                     | words                         | seq_len |\n",
      "+-------------------------------+-------------------------------+---------+\n",
      "| This is the first instance... | ['this', 'is', 'the', 'fir... | 6       |\n",
      "+-------------------------------+-------------------------------+---------+\n",
      "+-------------------+-------------------------------+---------+\n",
      "| raw_words         | words                         | seq_len |\n",
      "+-------------------+-------------------------------+---------+\n",
      "| Second instance . | ['Second', 'instance', '.'... | 3       |\n",
      "+-------------------+-------------------------------+---------+\n",
      "+------------------+-------------------------------+---------+\n",
      "| raw_words        | words                         | seq_len |\n",
      "+------------------+-------------------------------+---------+\n",
      "| Third instance . | ['Third', 'instance', '.']... | 3       |\n",
      "+------------------+-------------------------------+---------+\n",
      "+-------------------------------+-------------------------------+---------+\n",
      "| raw_words                     | words                         | seq_len |\n",
      "+-------------------------------+-------------------------------+---------+\n",
      "| This is the first instance... | ['this', 'is', 'the', 'fir... | 6       |\n",
      "+-------------------------------+-------------------------------+---------+\n"
     ]
    }
   ],
   "source": [
    "# Iterating over the dataset yields one row at a time; each row prints as its own one-row table.\n",
    "# The loop variable is named `row` so it does not overwrite the `instance` object created earlier.\n",
    "for row in dataset:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "#vocabulary:\n",
      "Vocabulary(['this', 'is', 'the', 'first', 'instance']...)\n",
      "#dataset:\n",
      "+-------------------------------+--------------------+---------+\n",
      "| raw_words                     | words              | seq_len |\n",
      "+-------------------------------+--------------------+---------+\n",
      "| This is the first instance... | [4, 5, 6, 7, 2, 3] | 6       |\n",
      "| Second instance .             | [8, 2, 3]          | 3       |\n",
      "| Third instance .              | [9, 2, 3]          | 3       |\n",
      "| This is the first instance... | [4, 5, 6, 7, 2, 3] | 6       |\n",
      "+-------------------------------+--------------------+---------+\n"
     ]
    }
   ],
   "source": [
    "from fastNLP import Vocabulary\n",
    "vocab = Vocabulary()\n",
    "# Build the vocabulary from the 'words' field of the dataset.\n",
    "vocab.from_dataset(dataset, field_name='words')\n",
    "# Convert the 'words' field to vocabulary indices. The field is overwritten in place:\n",
    "# the table printed by this cell shows 'words' holding integers afterwards.\n",
    "vocab.index_dataset(dataset, field_name='words')\n",
    "\n",
    "print('#vocabulary:')\n",
    "print(vocab)\n",
    "print('#dataset:')\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "this\n",
      "2\n",
      "1\n",
      "1\n"
     ]
    }
   ],
   "source": [
    "# Index -> word lookup.\n",
    "print(vocab.to_word(4))\n",
    "# Word -> index lookups. Per the recorded output, the two words that never occur\n",
    "# in the dataset both come back as index 1.\n",
    "for query in ('instance', 'instances', 'ssinstancess'):\n",
    "    print(vocab.to_index(query))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
